library(SingleCellExperiment)
library(tidyverse)
library(igraph)
library(scran)
devtools::load_all("~/miloR/")
## Setup to use python in Rmd
library(reticulate)
reticulate::use_condaenv("emma_env")
theme_dimred <- function( ... ){
theme(axis.ticks =element_blank(), axis.text = element_blank(), plot.title = element_text(hjust=0.5), ... )
}
import scanpy as sc
import pandas as pd
import numpy as np
## To show plots inline
import matplotlib.pyplot as plt
import sys,os
plt.switch_backend('agg')
sys.path.insert(1, '/nfs/team205/ed6/bin/thATAC/preprocess_utils/')
import atac_utils
import scipy.sparse
from pathlib import Path
# sc._settings.ScanpyConfig.figdir = Path(r.outdir)
Load anndata object, downloaded from here following the link from Park et al. 2020
rna_adata = sc.read_h5ad("/nfs/team205/ed6/data/Park_scRNAseq/HTA08.v01.A06.Science_human_tcells.raw.h5ad")
rna_adata.X = rna_adata.raw.X
rna_adata.X = scipy.sparse.csc_matrix(rna_adata.X)
Load MOFA projection, to use as reduced dimensionality object
mofa_dims <- read.csv("/nfs/team205/ed6/data/thymus_data/thymus_MOFA_projection.csv") %>%
column_to_rownames("cell")
## Filter just the scRNA-seq cells and factors 4 knn graoh construciton
mofa_dims <- as.matrix(mofa_dims[rownames(py$rna_adata$obs),1:5])
Convert anndata to SingleCellExperiment object
adata <- py$rna_adata
cnt <- t(adata$X)
rownames(cnt) <- adata$var_names$to_list()
colnames(cnt) <- adata$obs_names$to_list()
logCnt <- log2(cnt + 1)
pca <- prcomp(t(vx))
# Create the SingleCellExperiment object
sce <- SingleCellExperiment(assay=list(counts=cnt, logcounts=logCnt), colData = adata$obs)
reducedDim(sce) <- mofa_dims
reducedDimNames(sce) <- "MOFA"
sce
## Save SingleCellExperiment
saveRDS(sce, "/nfs/team205/ed6/data/Park_scRNAseq/HTA08.v01.A06.Science_human_tcells.SingleCellExperiment.RDS")
sce <- readRDS("~/Downloads/HTA08.v01.A06.Science_human_tcells.SingleCellExperiment.RDS")
sce
class: SingleCellExperiment
dim: 33694 50514
metadata(0):
assays(2): counts logcounts
rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
rowData names(0):
colnames(50514): FCAImmP7179369-AAACCTGAGCCCAATT FCAImmP7179369-AAACCTGAGCCTATGT ...
Human_colon_16S7985397-TTTGCGCAGAGCCCAA Human_colon_16S7985397-TTTGGTTAGTACGCGA
colData names(14): Sample donor ... cell types umap_density_Age
reducedDimNames(1): MOFA
altExpNames(0):
object.size(sce)
16486280 bytes
Cells in different samples where sorted using different FACS gates, which affect significantly the cell type composition of different samples. To simplify interpretation of DA analysis, I retain only samples that where obtained from total tissue and CD45+ cells, which have a similar cell type composition.
keep_cells <- which(sce$sort %in% c("TOT", "45P"))
sce <- sce[,keep_cells]
sce
class: SingleCellExperiment
dim: 33694 38081
metadata(0):
assays(2): counts logcounts
rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
rowData names(0):
colnames(38081): FCAImmP7179369-AAACCTGAGCCCAATT FCAImmP7179369-AAACCTGAGCCTATGT ...
Human_colon_16S7985397-TTTGCGCAGAGCCCAA Human_colon_16S7985397-TTTGGTTAGTACGCGA
colData names(14): Sample donor ... cell types umap_density_Age
reducedDimNames(1): MOFA
altExpNames(0):
Make Milo object
milo <- Milo(sce)
milo
class: Milo
dim: 33694 38081
metadata(0):
assays(2): counts logcounts
rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
rowData names(0):
colnames(38081): FCAImmP7179369-AAACCTGAGCCCAATT
FCAImmP7179369-AAACCTGAGCCTATGT ...
Human_colon_16S7985397-TTTGCGCAGAGCCCAA
Human_colon_16S7985397-TTTGGTTAGTACGCGA
colData names(14): Sample donor ... cell types umap_density_Age
reducedDimNames(1): MOFA
altExpNames(0):
neighbourhoods dimensions(1): 0
neighbourhoodCounts dimensions(2): 1 1
neighbourDistances dimensions(2): 1 1
graph names(0):
neighbourhoodIndex names(1): 0
object.size(milo)
13060216 bytes
For now I use scran function instead of buildGraph from package because it’s very slow
## Rename MOFA dim reduction as PCA so buildGraph can find it
reducedDim(milo, "PCA") <- reducedDim(milo)
#
# library(BiocNeighbors)
# library(BiocParallel)
# milo <- buildGraph(milo, k = 30)
knn_graph <- buildKNNGraph(reducedDim(milo, "MOFA"), k=50, d=NA, transposed=TRUE)
miloR::graph(milo) <- knn_graph
Run umap
umap_th <- uwot::umap(reducedDim(milo, "MOFA"), n_neighbors=50 )
reducedDim(milo, 'UMAP') <- umap_th
scater::plotUMAP(milo, colour_by="Age", point_size=0.5, point_alpha=0.5) +
facet_wrap('colour_by')
knn_graph <- buildKNNGraph(reducedDim(milo, "MOFA"), k=50, d=NA, transposed=TRUE)
miloR::graph(milo) <- knn_graph
Run umap
umap_th <- uwot::umap(reducedDim(milo, "MOFA"), n_neighbors=50, verbose=TRUE)
reducedDim(milo, 'UMAP') <- umap_th
scater::plotUMAP(milo, colour_by="Age", point_size=0.5, point_alpha=0.5) +
facet_wrap('colour_by')
Sample neighborhoods with refined sampling scheme
# milo@neighbourhoods <- list()
system.time(milo <- makeNeighbourhoods(milo, prop=0.1, k = 50, d=5, refined = TRUE, reduced_dims = "PCA", seed = 43))
Checking valid object
user system elapsed
62.178 1.181 63.665
plotNeighborhoodSizeHist(milo, bins=100)
Make model matrix for testing. I use Age as an ordinal variable for testing.
th.model
(Intercept) Age
F21_TH_45P 1 9
F22_TH_TOT 1 3
F23_TH_45P 1 5
F29_TH_45P 1 10
F29_TH_45P_5GEX 1 10
F30_TH_45P 1 8
F30_TH_45P_5GEX 1 8
F38_TH_45P 1 7
F38_TH_45P_5GEX 1 7
F41_TH_45P 1 9
F41_TH_45P_5GEX 1 9
F45_TH_45P 1 6
F45_TH_45P_5GEX 1 6
F64_TH_TOT_5GEX_1 1 5
F64_TH_TOT_5GEX_2 1 5
C34_TH_TOT_5GEX 1 3
C40_TH_TOT_1 1 1
C40_TH_TOT_2 1 1
C41_TH_TOT_1 1 2
C41_TH_TOT_2 1 2
F74_TH_TOT_5GEX_1 1 4
F74_TH_TOT_5GEX_2 1 4
attr(,"assign")
[1] 0 1
milo <- countCells(milo,
data = data.frame(colData(milo)[,c("Sample","Age")]),
samples = "Sample")
Checking data validity
Setting up matrix with 3544 neighbourhoods
Counting cells in neighbourhoods
nh_counts <- milo@neighbourhoodCounts
milo_res <- testNeighbourhoods(milo, design = ~ Age, data = th.meta)
Performing spatial FDR correction
hist(milo_res$PValue)
milo_res %>%
ggplot(aes(PValue, SpatialFDR)) + geom_point() +
geom_abline(linetype=2)
milo_res %>%
ggplot(aes(SpatialFDR)) + geom_histogram()
milo_res %>%
mutate(is_sig=ifelse(SpatialFDR < 0.05, TRUE, FALSE)) %>%
ggplot(aes(logFC, -log10(SpatialFDR), color=is_sig)) +
geom_point(alpha=0.2)
plotMiloReducedDim(milo, milo_results = milo_res, filter_alpha = NULL)
plotMiloReducedDim(milo, milo_results = milo_res, filter_alpha = 0.1)
Perhaps the most interesting part is that proliferative double positive T cells seem to be divided in 2 groups, those more enriched in early stages and late stages. What are the biological differences between these?